Supp Material for Luminous Ostracod Vargula tsujii and Non-Luminous Ostracod Skogsbergia sp.
The R Juypter Notebook serves as a comprehensive repository encompasing most of the scripts and figures relevant to the luminous ostracod Vargula tsujii and non-luminous ostracod Skogsbergia sp. analyses outlined in the publication. This notebook includes the following analyses: QC steps, WGCNA, Network Visualization, Network Connectivity, Differential Gene Expression and GO enrichments.
Author: Lisa Yeter Mesrop
#load libraries
library(WGCNA)
library(tidyverse)
library(edgeR)
library(matrixStats)
library(DESeq2)
library(dplyr)
library(readxl)
library(data.table)
library(ggplot2)
library(ggrepel)
library(repr)
library(topGO)
library(reshape2)
library(plyr)
library(scales)
library(readxl)
library(repr)
library(RColorBrewer)
library(pheatmap)
library(flashClust)
library(ggvenn)
#always use the following WGCNA functions
options(stringsAsFactors = FALSE)
enableWGCNAThreads()
allowWGCNAThreads(nThreads = 22)
The initial gene expression matrix consists of 63 samples and 80911 genes before quality control (QC).
#read in the merged counts
merged_counts <- read.csv("July2024_updated_combined.csv", header = TRUE, sep = ",")
#set the first row as row names and remove it from the data frame
row.names(merged_counts) <- merged_counts[, 1]
merged_counts <- merged_counts[, -1]
#create the metadata
meta <- data.frame(names = colnames(merged_counts))
print(meta$names)
sample_name = c("A", "A", "A", "AIF", "AIF", "AIF", "AII", "AII", "AII", "AIII", "AIII", "AIII", "AIM", "AIM", "AIM", "AIV", "AIV", "AIV","AV", "AV", "AV", "B", "B", "B",
"C", "C", "C", "E", "E", "E", "F", "F", "F", "G", "H", "H", "M", "M","M", "upper_lip","upper_lip","upper_lip","upper_lip","upper_lip","upper_lip","upper_lip","upper_lip",
"upper_lip", "upper_lip", "eye", "gut", "upper_lip","eye", "gut", "upper_lip", "eye", "gut","upper_lip","eye", "gut", "upper_lip","eye", "gut")
meta$sample_name <- sample_name
As a preliminary quality control (QC) measure for WGCNA analysis, the overall similarity between samples and transcripts with low counts was assessed, as these counts often introduce noise in co-expression analyses. A filter was applied to the expression matrix, removing transcripts with fewer than 5 counts in more than 3 samples, given that some sample types included a minimum of 3 biological replicates. The QC analyses utilized functions from the DESeq2 package (Love et al., 2014).
#import count table, meta and sample_name objects into a DESeq2 object
dds_count_table <- DESeqDataSetFromMatrix(countData = merged_counts, colData = meta, design = ~sample_name)
#determine the number of prefiltered counts for each sample
colSums(assay(dds_count_table))
#the following six samples were removed due to very low counts: M1.1.counts.tab, M2.1.counts.tab, M3.1.counts.tab, AV.1.counts.tab, AV.2.counts.tab, AV.4.counts.tab
#update the merged counts, meta and sample_name objects with the six samples removed
merged_counts_subset <- subset(merged_counts, select= -c(M1.1.counts.tab, M2.1.counts.tab, M3.1.counts.tab, AV.1.counts.tab, AV.2.counts.tab, AV.4.counts.tab))
meta_subset <- data.frame(row.names = colnames(merged_counts_subset))
sample_name_subset = sample_name = c("A", "A", "A", "AIF", "AIF", "AIF", "AII", "AII", "AII", "AIII", "AIII", "AIII", "AIM", "AIM", "AIM", "AIV", "AIV", "AIV", "B", "B", "B",
"C", "C", "C", "E", "E", "E", "F", "F", "F", "G", "H", "H", "upper_lip","upper_lip","upper_lip","upper_lip","upper_lip","upper_lip","upper_lip","upper_lip",
"upper_lip", "upper_lip", "eye", "gut", "upper_lip","eye", "gut", "upper_lip", "eye", "gut","upper_lip","eye", "gut", "upper_lip","eye", "gut")
meta_subset$sample_name_subset <- sample_name_subset
meta_subset$names_subset <- rownames(meta_subset)
rownames(meta_subset) <- NULL
nrow(meta_subset)
Perform variance-stabilizing transformation using DEseq2 package (Love et al., 2014). The authors of WGCNA recommend variance-stabilizing transformation as a normalization method before conducting network analyses (Langfelder and Horvath, 2008).
#import updated count table, meta and sample_name objects into a DESeq2 object
dds_count_table_subset<- DESeqDataSetFromMatrix(countData = merged_counts_subset, colData = meta_subset, design = ~sample_name_subset)
#filter the count table
dds_merged_table_prefiltered <- dds_count_table_subset[rowSums(counts(dds_count_table_subset) >=5) >=3,]
nrow(dds_merged_table_prefiltered)
#run DESeq2 function with variance-stabilizing transformation
dds_subset <- DESeq(dds_merged_table_prefiltered, betaPrior = FALSE, parallel = TRUE)
#perform a variance-stabilizing transformation
vsd_subset <- varianceStabilizingTransformation(dds_subset)
#perform PCA
pcaData <- plotPCA(vsd_subset, intgroup="sample_name_subset", returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=sample_name_subset, shape=sample_name_subset)) +
geom_point(size=5) +
labs(color = "Sample Types")+ labs(shape = "Sample Types")+
scale_shape_manual(values = c(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16))+
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()+theme(
panel.grid.major = element_line(colour = "gray97", size = 1),
panel.grid.minor = element_line(linetype = "dotted"),
panel.background = element_rect(fill = NA),
legend.key = element_rect(fill = "gray100"),
axis.line = element_line(size = 0.5, linetype = "solid"),
panel.border = element_rect(colour = "black", fill = NA, size = 1)
) + theme(
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
legend.title = element_text(size = 16),
legend.text = element_text(size = 12)
)
After completing the prefiltering and normalization steps above, the count matrix now includes 57 samples and 33479 transcript and is ready for WGCNA. WGCNA (Weighted Gene Co-expression Network Analysis) identifies clusters (modules) of highly correlated genes by constructing a network based on pairwise correlations between gene expression profiles (Langfelder and Horvath, 2008). These modules often correspond to specific biological processes or pathways, indicating that the genes within a module may be part of the same regulatory process. By analyzing the connectivity of genes within each module, WGCNA also helps identify key drivers or hub genes, providing insights into gene regulation and the biological processes they govern. The following scripts are from the WGCNA package (Langfelder and Horvath, 2008).
#use the prefiltered and normalized count matrix from the DEseq2
vsd_subset_matrix <- assay(vsd_subset)
## many functions expect the matrix to be transposed
datExpr <- t(vsd_subset_matrix)
## check rows/cols
nrow(datExpr)
ncol(datExpr)
#ready to start WGCNA Analysis
#run this to check if there are gene outliers
gsg=goodSamplesGenes(datExpr, verbose = 3)
gsg$allOK
# choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# call the network topology analysis function
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
# plot the results:
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
# co-expression similarity and adjacency using assigned softpower
softPower=3
adjacency = adjacency(datExpr, power = softPower)
# calculate the Topological Overlap Matrix (TOM)
# turn adjacency into topological overlap, i.e. translate the adjacency into
# topological overlap matrix and calculate the corresponding dissimilarity
TOM = TOMsimilarity(adjacency, TOMType = "signed", verbose = 5);
dissTOM = 1-TOM;
# call the hierarchical clustering function
geneTree = hclust(as.dist(dissTOM), method = "average");
# plot the resulting clustering tree (dendrogram)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04);
# set the min module size
minModuleSize = 30;
# module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit = 3, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
table(dynamicMods)
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
# plot the dendrogram and colors underneath
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
# calculate eigengenes
MEList = moduleEigengenes(datExpr, colors = dynamicColors)
MEs = MEList$eigengenes
# calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs);
# cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# plot the result
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
MEDissThres = 0.05
# plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")
# call an automatic merging function
merge = mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
# merged module colors
mergedColors = merge$colors;
# eigengenes of the new merged modules
mergedMEs = merge$newMEs;
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
#note, dynamicColors and mergedColors are the same in this case
table(mergedColors)
The functionally demonstrated c-luciferase gene, along with some phylogenetically similar c-luciferase genes, are located in the pink module, which we refer to as the Bioluminescent Co-Regulatory Network (BCN).
#determine which column number in the datExpr object that corresponds to c-luciferase
which(colnames(datExpr) == "NODE_10321_length_1972_cov_1770.41_g3092_i1")
#determine the color of the module with c-luciferase
dynamicColors[[409]]
# extract all modules as a table for downstream analyses
SubGeneNames = colnames(datExpr)
for (color in dynamicColors){
module=SubGeneNames[which(dynamicColors==color)]
write.table(module, paste("module_",color, ".txt",sep=""), sep="\t", row.names=FALSE, col.names=FALSE,quote=FALSE)
}
# extract the pink module of interest from the dynamicColors object
SubGeneNames = colnames(datExpr)
pink=as.data.frame(SubGeneNames[which(dynamicColors=="pink")])
names(pink)[1] <- "transcript_id"
head(pink)
#read in the Trinotate sheet for Vargula tsujii.
Trinotate_lym_subset <- read_excel("Trinotate_lym_subset.xlsx")
#incorporate annotation for the pink module (the BCN)
pink_module_gene_annot_deesplit3 <- setDT(Trinotate_lym_subset, key = 'transcript_id')[J(pink)]
#write.csv(pink_module_gene_annot_deesplit3, file = "pink_module_gene_annot_power3_deepsplit3.csv")
#extract the module that is significantly correlated with the eye sample (from Section 4.8) from the dynamicColors object
BCN_lightcyan_EYE=as.data.frame(SubGeneNames[which(dynamicColors=="lightcyan")])
names(BCN_lightcyan_EYE)[1] <- "transcript_id"
BCN_lightcyan_EYE_gene_annot <- setDT(Trinotate_lym_subset, key = 'transcript_id')[J(BCN_lightcyan_EYE)]
#write.csv(BCN_lightcyan_EYE_gene_annot, file = "BCN_lightcyan_EYE_gene_annot.csv")
#extract the module that is correlated with gut (from Section 4.8) from the dynamicColors object
BCN_yellow_GUT=as.data.frame(SubGeneNames[which(dynamicColors=="yellow")])
names(BCN_yellow_GUT)[1] <- "transcript_id"
#match the annotation with each transcript
BCN_yellow_GUT_gene_annot <- setDT(Trinotate_lym_subset, key = 'transcript_id')[J(BCN_yellow_GUT)]
#write.csv(BCN_yellow_GUT_gene_annot, file = "BCN_yellow_GUT_gene_annot.csv")
Identify modules (network) that are significantly associated with samples. The module eigengene provides a representative measure of the gene expression patterns within a module, allowing correlation with these traits to determine the most significant associations (Langfelder and Horvath, 2008). This correlation can then be visualized in a module-to-trait heatmap to determine the most significant associations.The following scripts are from the WGCNA package (Langfelder and Horvath, 2008).
traitData <- read_excel("traits_all_tsujii_updated_pink_deepsplit3.xlsx")
traitData <- as.data.frame(traitData)
names(traitData)
head(traitData)
allTraits = traitData
head(allTraits)
names(allTraits)
All_samples = rownames(datExpr)
traitRows = match(All_samples, allTraits$Samples)
traitRows
datTraits = allTraits[traitRows, -1]
rownames(datTraits) = allTraits[traitRows, 1]
# double check that row names agree
table(rownames(datTraits)==rownames(datExpr))
datTraits$Samples_names <- NULL
head(datTraits)
# sample network based on squared Euclidean distance
A=adjacency(t(datExpr),type="distance")
# this calculates the whole network connectivity
k=as.numeric(apply(A,2,sum))-1
# standardized connectivity
Z.k=scale(k)
# designate samples as outlying
# if their Z.k value is below the threshold
thresholdZ.k=-5 # often -2.5
# the color vector indicates outlyingness (red)
outlierColor=ifelse(Z.k<thresholdZ.k,"red","black")
# calculate the cluster tree using flahsClust or hclust
sampleTree = flashClust(as.dist(1-A), method = "average")
# convert traits to a color representation:
# where red indicates high values
traitColors=data.frame(numbers2colors(datTraits,signed=FALSE))
dimnames(traitColors)[[2]]=paste(names(datTraits),"",sep="")
datColors=data.frame(outlierC=outlierColor,traitColors)
# choose a module assignment
moduleColors_Vtsujii=dynamicColors
# define numbers of genes and samples
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
# recalculate MEs with color labels
MEs0 = moduleEigengenes(datExpr,moduleColors_Vtsujii)$eigengenes
MEsFemale = orderMEs(MEs0)
modTraitCor = cor(MEsFemale, datTraits, use = "p")
modTraitP = corPvalueStudent(modTraitCor, nSamples)
#color code each association by the correlation value; display correlations and their p-values
textMatrix = paste(signif(modTraitCor, 2), "\n(",
signif(modTraitP, 1), ")", sep = "")
dim(textMatrix) = dim(modTraitCor)
par(mar = c(8, 8.5, 3, 3))
# display the correlation values within a heatmap plot
labeledHeatmap(Matrix = modTraitCor, xLabels = names(datTraits),
yLabels = names(MEsFemale), ySymbols = names(MEsFemale),
colorLabels =FALSE,colors = blueWhiteRed(50),textMatrix=textMatrix,
setStdMargins = FALSE, cex.text = 0.5, zlim = c(-1,1),
main = paste("Module-trait relationships"))
The Bioluminescent Co-regulatory Network (BCN) has over 900 co-expressed transcripts making it challenging to visualize the entire network topology comprehensively. To address this, we used Cytoscape to visualize the network and we subsetted the network to only include genes that are signifcantly upregulated (uniquely) in the bioluminescent upper lip from Section 9.4.
Export the network of interest into edge and node list files for Cytoscape (Shannon et al., 2003).
# select module of interest
module = "pink"
# select module probes
genes = colnames(datExpr)
inModule = dynamicColors==module
modProbes = genes[inModule]
# select the corresponding Topological Overlap
modTOM = TOM[inModule, inModule]
dimnames(modTOM) = list(modProbes, modProbes)
# export the network into edge and node list files Cytoscape
cyt = exportNetworkToCytoscape(modTOM,
edgeFile = paste("BCN-CytoscapeInput-edges-pink-deepslit3", paste(module, collapse="-"), ".txt", sep=""),
nodeFile = paste("BCN-CytoscapeInput-nodes-pink-deepsplit3", paste(module, collapse="-"), ".txt", sep=""),
weighted = TRUE,
threshold = 0.02,
nodeNames = modProbes,
altNodeNames = modProbes,
nodeAttr = dynamicColors[inModule])
#for visualization of the BCN in Cytoscape, determine how many of the significantly upregulated genes expressed in the bioluminescent upper lip (from Section 9.4) are found in the BCN.
BCN_DE_unique_Bio_UpperLip <- pink %>%
filter(transcript_id %in% unique_genes_bio_upper_lip_info$transcript_id)
#add annotation
BCN_DE_unique_Bio_UpperLip_annot <- left_join(BCN_DE_unique_Bio_UpperLip,Trinotate_lym_subset,by="transcript_id")
GO enrichment analyses was performed using topGO package (Alexa Rahnenfuhrer, 2024). The figures representing the GO enrichments presented here were utilized for analysis, but were not included in the main text of the publication. The GO enrichments were visualized with GO-Figure! package for publication (Reijnders and Waterhouse, 2021). GO-Figure! reference https://github.com/lmesrop/BCN_publication/tree/main/Go-Figure!.
#import the trinotate go sheet from the Trinotate output
geneID2GO <- readMappings(file ="Trinotate_go_lym.txt")
geneNames <- names(geneID2GO)
#save the transcript ids of all the annotated genes under geneNames object
geneNames<- as.character(Trinotate_lym_subset$transcript_id)
#save the transcript
myInterestingGenes= as.character(pink[,1])
#subset the genesNames by the transcript IDs in my red module
geneList <- factor(as.integer(geneNames %in% myInterestingGenes))
names(geneList) <- geneNames
head(geneList)
#run the topGO function.
GOdata <- new("topGOdata", ontology = "BP", allGenes = geneList,
annot = annFUN.gene2GO, gene2GO = geneID2GO)
results_go <- runTest(GOdata, algorithm="weight01", statistic="Fisher")
#retrieve the GO enrichment
goEnrichment <- GenTable(GOdata, Fisher = results_go, orderBy = "Fisher", topNodes = 500, numChar=1000)
#graph the GO enrichment
goEnrichment$Fisher <- as.numeric(goEnrichment$Fisher)
goEnrichment <- goEnrichment[goEnrichment$Fisher < 0.05,]
#extract transcript ids that are significantly enriched in the BCN
myterms =goEnrichment$GO.ID
mygenes = genesInTerm(GOdata, myterms)
#extract the transcript ids for each GO term
var=c()
for (i in 1:length(myterms))
{
myterm <- myterms[i]
mygenesforterm <- mygenes[myterm][[1]]
myfactor <- mygenesforterm %in% myInterestingGenes
mygenesforterm2 <- mygenesforterm[myfactor == TRUE]
mygenesforterm2 <- paste(mygenesforterm2, collapse=',')
var[i]=paste(myterm,"genes:",mygenesforterm2)
}
#plot the GO enrichment results
ntop = 135
ggdata <- goEnrichment[1:ntop,]
ggdata$Term <- factor(ggdata$Term, levels = rev(ggdata$Term))
plot_BCN_GO_BP <- ggplot(ggdata,
aes(x = Term, y = -log10(Fisher), size = Significant, fill = -log10(Fisher))) +
ylim(-1,21) +
geom_point(shape = 21) +
scale_size(range = c(2.5,12.5)) +
scale_fill_continuous(low = 'royalblue', high = 'red4') +
labs(
title = 'GO enrichment - BP - BCN')+
theme_bw(base_size = 20) +
labs(size= "Number of Genes")+
theme(
legend.position = 'right',
legend.background = element_rect(),
plot.title = element_text(angle = 0, size = 20, face = 'bold', vjust = 1),
plot.subtitle = element_text(angle = 0, size = 10, face = 'bold', vjust = 1),
plot.caption = element_text(angle = 0, size = 12, face = 'bold', vjust = 1),
axis.text.x = element_blank(),
axis.text.y = element_text(angle = 0, size = 13, face = 'bold', vjust = 0.5),
axis.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 8, face = 'bold'),
axis.line = element_line(colour = 'black'),
#Legend
legend.key = element_blank(),
legend.text = element_text(size = 16, face = "bold"))+
coord_flip()
plot_BCN_GO_BP + labs(x = NULL)
#function from the R package repr to visualize figures in Jupyter Notebook(optional)
options(repr.plot.width=14, repr.plot.height=20, repr.plot.res = 500)
#run the topGO function
GOdata_MF <- new("topGOdata", ontology = "MF", allGenes = geneList,
annot = annFUN.gene2GO, gene2GO = geneID2GO)
results_go_MF <- runTest(GOdata_MF, algorithm="weight01", statistic="Fisher")
#retrieve the GO enrichment
goEnrichment_MF <- GenTable(GOdata_MF, Fisher = results_go_MF, orderBy = "Fisher", topNodes = 100, numChar=1000)
goEnrichment_MF$Fisher <- as.numeric(goEnrichment_MF$Fisher)
#plot GO MF enchriment plot
ntop = 65
ggdata <- goEnrichment_MF[1:ntop,]
ggdata$Term <- factor(ggdata$Term, levels = rev(ggdata$Term))
GO_MF_BCN_plot <- ggplot(ggdata,
aes(x = Term, y = -log10(Fisher), size = Significant, fill = -log10(Fisher))) +
ylim(-1,21) +
geom_point(shape = 21) +
scale_size(range = c(2.5,12.5)) +
scale_fill_continuous(low = 'royalblue', high = 'red4') +
labs(
title = 'GO Analysis - MF - BCN')+
theme_bw(base_size = 20) +
labs(size= "Number of Genes")+
theme(
legend.position = 'right',
legend.background = element_rect(),
plot.title = element_text(angle = 0, size = 20, face = 'bold', vjust = 1),
plot.subtitle = element_text(angle = 0, size = 10, face = 'bold', vjust = 1),
plot.caption = element_text(angle = 0, size = 12, face = 'bold', vjust = 1),
axis.text.x = element_blank(),
axis.text.y = element_text(angle = 0, size = 13, face = 'bold', vjust = 0.5),
axis.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 8, face = 'bold'),
axis.line = element_line(colour = 'black'),
#Legend
legend.key = element_blank(),
legend.text = element_text(size = 16, face = "bold"))+
coord_flip()
GO_MF_BCN_plot + labs(x=NULL)
#function from the R package repr to visualize figures in Jupyter Notebook(optional)
options(repr.plot.width=18, repr.plot.height=12, repr.plot.res = 500)
WGCNA identifies genes integral to a (module) network using a network characteristic called module membership (Langfelder and Horvath, 2008). Module membership represents connectivity of a gene with other genes within a module and is used to define centralised hub genes (Langfelder and Horvath, 2008). Module membership (MM) is a measure ranging from 0 to 1, with higher values indicating strong connectivity within a module and lower values indicating weak connectivity (Langfelder and Horvath, 2008). Genes with similar expression patterns within a module are not only correlated with the module's overall expression (MM) but also tightly interconnected with other genes in the same module (IC) (Langfelder and Horvath, 2008). In other words, genes that are strongly correlated with the overall expression pattern of their modules (high MM) are also highly connected to other genes within those modules (high IC). There is a strong correlation between intramodule connectivity and module membership (Langfelder and Horvath, 2008).
#select the same power used for WGCNA analysis in Section 4.3
ADJ1=abs(cor(datExpr,use="p"))^3
Alldegrees1=intramodularConnectivity(ADJ1, dynamicColors)
head(Alldegrees1)
datME=moduleEigengenes(datExpr,dynamicColors)$eigengenes
signif(cor(datME, use="p"), 2)
datKME =signedKME(datExpr, datME, outputColumnName="MM.")
head(datKME)
nrow(datKME)
#determine the module membership vs intramodular connectivity for the pink module (the BCN)
which.color="pink"
restrictGenes=dynamicColors==which.color
verboseScatterplot(Alldegrees1$kWithin[ restrictGenes],
(datKME[restrictGenes, paste("MM.", which.color, sep="")])^10,
col=which.color,
xlab="Intramodular Connectivity",
ylab="(Module Membership)^3")
#function from the R package repr to visualize figures in Jupyter Notebook(optional)
options(repr.plot.width=4, repr.plot.height=3)
Identify all co-expressed transcripts within the BCN (red module) that have a high module membership (MM > 0.8). An MM value of 0.8 or higher indicates a strong association with the module eigengene, suggesting that the gene plays a central role in the module's network and can be considered a hub gene (Langfelder and Horvath, 2008).
combined_datKME_ADJ1 = merge(datKME, Alldegrees1, by=0)
head(combined_datKME_ADJ1)
subset_combined_datKME_ADJ1 = dplyr::select(combined_datKME_ADJ1, c("Row.names","MM.pink", "kWithin","kTotal"))
head(subset_combined_datKME_ADJ1)
#make the transcript ids rownames again
row.names(subset_combined_datKME_ADJ1) <- subset_combined_datKME_ADJ1$Row.names
subset_combined_datKME_ADJ1[1] <- NULL
head(subset_combined_datKME_ADJ1)
#subset the subset_combined_datKME_ADJ1 to include just transcripts from the red module (the BCN)
top_datKME_ADJ1_.8 <-subset_combined_datKME_ADJ1 %>% dplyr::filter(subset_combined_datKME_ADJ1$MM.pink >= 0.79)
head(top_datKME_ADJ1_.8)
#make rowname a column for downstream subsetting
top_datKME_ADJ1_.8$transcript_id <- rownames(top_datKME_ADJ1_.8)
head(top_datKME_ADJ1_.8)
#reorganize the column orders
top_datKME_ADJ1_.8_column_reordered <- top_datKME_ADJ1_.8[, c("transcript_id", "MM.pink", "kWithin", "kTotal")]
head(top_datKME_ADJ1_.8_column_reordered)
# reorder spreadsheet based on MM.red column from largest to smallest
top_datKME_ADJ1_.8_column_reordered_desc <-top_datKME_ADJ1_.8_column_reordered %>% dplyr::arrange(desc(kWithin))
head(top_datKME_ADJ1_.8_column_reordered_desc)
#annotate the top datKME genes
top_datKME_ADJ1_.8_column_reordered_desc_annot_trinotate <- left_join(top_datKME_ADJ1_.8_column_reordered_desc,Trinotate_lym_subset,by="transcript_id")
head(top_datKME_ADJ1_.8_column_reordered_desc_annot_trinotate)
#write.csv(top_datKME_ADJ1_.8_column_reordered_desc_annot_trinotate, file = "081224_top_datKME_ADJ1_.8_desc_BCN_pink_splitdeep3.csv")
Use the results from the DGE analysis in Section Step 9.4 to identify the significantly upregulated genes (expressed uniquely) with the highest module membership (MM > 0.8).
#determine how many of the co-expressed genes in the BCN with MM > 0.8 are significantly upregulated in the bioluminescent upper lip
top_datKME_ADJ1_.8_column_reordered_desc_annot_DE_bio_upper_lip <- top_datKME_ADJ1_.8_column_reordered_desc_annot_trinotate %>%
inner_join(unique_genes_bio_upper_lip_info, by = "transcript_id")
#join
top_datKME_ADJ1_.8_column_reordered_desc_annot_DE_bio_upper_lip_logfold <- left_join(top_datKME_ADJ1_.8_column_reordered_desc_annot_DE_bio_upper_lip ,unique_genes_bio_upper_lip_info,by="transcript_id")
Import V.tsujii gene expression matrix which consists of three tissue types - upper lip, compound eye and gut - with five biological replicates for each tissue. Generate the sample name sheet (meta sheet) for downstream DESeq2 analyses.
#read in the gene expression matrix
organ_level_vt <- read.delim("combined_vargula_tsujii.tab", header = TRUE, sep = "\t", quote = "")
head(organ_level_vt)
#remove the extra column that got inserted
row.names(organ_level_vt) <-organ_level_vt$X
organ_level_vt[1]<-NULL
organ_level_vt$X.1 <- NULL
#import the sample sheet
sampleinfo <- read_excel("sampleinfo_vtsujii_DGE.xlsx")
sampleinfo
#generate the DESeq data set
dds_count_table_organ_level <- DESeqDataSetFromMatrix(countData = organ_level_vt, colData = sampleinfo, design = ~group)
#run DESeq2 function and normalization
dds_vtsujii <- DESeq(dds_count_table_organ_level, betaPrior = FALSE, parallel = TRUE)
#perform a variance-stabilizing transformation
dds_vtsujii_vsd <- varianceStabilizingTransformation(dds_vtsujii)
#transpose the matrix
sampleDists_dds_vtsujii_vsd <- dist(t(assay(dds_vtsujii_vsd)))
#plot the heatmap
sampleDistMatrix_dds_vtsujii_vsd <- as.matrix(sampleDists_dds_vtsujii_vsd)
rownames(sampleDistMatrix_dds_vtsujii_vsd) <- paste(colData(dds_count_table_organ_level)$group)
colnames(sampleDistMatrix_dds_vtsujii_vsd) <- colData(dds_count_table_organ_level)$samples
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix_dds_vtsujii_vsd,
clustering_distance_rows=sampleDists_dds_vtsujii_vsd,
clustering_distance_cols=sampleDists_dds_vtsujii_vsd,
col=colors)
options(repr.plot.width=8, repr.plot.height=6, repr.plot.res = 150)
pcaData <- plotPCA(dds_vtsujii_vsd, intgroup="group", returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=group, shape=group)) +
labs(color = "Tissue Types")+ labs(shape = "Tissue Types")+
geom_point(size=5) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
geom_point(size=5) +
theme(panel.grid.major = element_line(colour = "gray97", size = 1), panel.grid.minor = element_line(linetype = "dotted"), panel.background = element_rect(fill = NA),
legend.key = element_rect(fill = "gray100")) + theme(axis.line = element_line(size = 0.5,linetype = "solid")) +
theme(panel.border = element_rect(colour = "black", fill=NA, size=1)) +
coord_fixed() +
scale_color_manual(values = c('#A5D6A7','#CE93D8', '#81D4FA'))+theme(
legend.title = element_text(size = 16),
legend.text = element_text(size = 14),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text = element_text(size = 12)
)
Differential gene expression analysis was carried out in DESeq2 (Love et al., 2014). For Vargula tsujii, we determined differentially upregulated genes in three tissue types - bioluminescent upper lip, compound eye and gut - using five biological replicates for each tissue. DESeq2 was employed using a p-value < 0.05 and FC > 1.5 for the significance of differentially expressed genes using the Benjamini-Hochberg method to account for false discovery rate (FDR). Pairwise comparisons were done across tissue types (i.e., bioluminescent upper lip to compound eye, bioluminescent upper lip to gut, gut to compound eye). To determine tissue-specific differential upregulation, each tissue was compared to the other two (e.g., genes upregulated uniquely in the bioluminescent upper lip were determined by comparing upper lip expression to both compound eye and gut). For each pairwise comparison, the reference tissue was specified to determine the significantly upregulated genes in each tissue type (i.e., positive vs negative log2fold change). For example, in the bioluminescent upper lip vs eye comparison, if a gene has a positive log2fold change, then that gene is upregulated in the bioluminescent upper lip compared to the eye. Conversely, if a gene has a negative log2fold change then the gene is upregulated in eye compared to the upper lip. Genes that were upregulated uniquely in each tissue type were used for all downstream analyses.
#run DESeq2 function and normalization
dds_vtsujii <- DESeq(dds_count_table_organ_level)
#set Eyes as reference tissue
dds_vtsujii$group <- relevel(dds_vtsujii$group, ref= "Eyes")
#rerun DESeq command after reference is specified
dds_vtsujii <- DESeq(dds_vtsujii)
#define contrasts, extract results table, and shrink the log2 fold changes
res_tableOE_unshrunken_Gut_Vs_Eye <- results(dds_vtsujii, contrast= c("group", "Gut", "Eyes") , alpha = 0.05)
res_tableOE_Gut_Vs_Eye <- lfcShrink(dds_vtsujii, contrast= c("group", "Gut", "Eyes"), res = res_tableOE_unshrunken_Gut_Vs_Eye )
mcols(res_tableOE_Gut_Vs_Eye, use.names=T)
# set thresholds
# lfc.cutoff value of 0.58 translates to a 1.5-fold change in expression
# padj.cutoff value of 0.05
padj.cutoff <- 0.05
lfc.cutoff <- 0.58
#convert to tibble
res_tableOE_Gut_Vs_Eye_tb <- res_tableOE_Gut_Vs_Eye %>% data.frame() %>% rownames_to_column(var="gene") %>% as_tibble()
#determine all differentially expressed genes that meet the cutoff values (i.e., padj.cutoff of 0.05 and lfc.cutoff of 0.58, both upregulated and downregulated)
sigOE_Gut_Vs_Eye <- res_tableOE_Gut_Vs_Eye_tb %>% filter(padj < padj.cutoff & abs(log2FoldChange) > lfc.cutoff)
#extract all genes that are significantly upregulated in the Gut (positive log2 fold change)
sigOE_UPREGULATED_logfold_Gut_vs_Eye <- sigOE_Gut_Vs_Eye %>% filter(padj < padj.cutoff & log2FoldChange > lfc.cutoff)
head(sigOE_UPREGULATED_logfold_Gut_vs_Eye)
colnames(sigOE_UPREGULATED_logfold_Gut_vs_Eye)[1]<- "transcript_id"
#add annotation
sigOE_UPREGULATED_logfold_Gut_vs_Eye_annot <- left_join(sigOE_UPREGULATED_logfold_Gut_vs_Eye,Trinotate_lym_subset,by="transcript_id")
#extract all genes that are significantly upregulated in the Compound Eye (negative log2fold change) compared to the Gut.
sigOE_DOWNREGULATED_logfold_Gut_vs_Eye <- sigOE_Gut_Vs_Eye %>% filter(padj < padj.cutoff & log2FoldChange < -lfc.cutoff)
#save these two dataframes for downstream analysis in Section 9.4
#genes that are significantly upregulated in the Gut (positive log2fold change)
head(sigOE_UPREGULATED_logfold_Gut_vs_Eye)
#genes that are significantly upregulated in the Compound Eye (negative log2fold change) compared to the Gut.
head(sigOE_DOWNREGULATED_logfold_Gut_vs_Eye)
#define contrasts, extract results table, and shrink the log2 fold changes
res_tableOE_unshrunken_Bio_UpperLip_Vs_Eye <- results(dds_vtsujii, contrast= c("group", "Upper_lip", "Eyes") , alpha = 0.05)
res_tableOE_Bio_UpperLip_Vs_Eye <- lfcShrink(dds_vtsujii, contrast= c("group", "Upper_lip", "Eyes"), res = res_tableOE_unshrunken_Bio_UpperLip_Vs_Eye)
mcols(res_tableOE_Bio_UpperLip_Vs_Eye, use.names=T)
#convert to tibble
res_tableOE_Bio_UpperLip_Vs_Eye_tb <- res_tableOE_Bio_UpperLip_Vs_Eye %>% data.frame() %>% rownames_to_column(var="gene") %>% as_tibble()
#determine all differentially expressed genes that meet the cutoff values (i.e., padj.cutoff of 0.05 and lfc.cutoff of 0.58, both upregulated and downregulated)
sigOE_Bio_UpperLip_Vs_Eye <- res_tableOE_Bio_UpperLip_Vs_Eye_tb %>% filter(padj < padj.cutoff & abs(log2FoldChange) > lfc.cutoff)
#extract all genes that are significantly upregulated in the Bioluminescent Upper Lip (positive log2 fold change)
sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye <- sigOE_Bio_UpperLip_Vs_Eye %>% filter(padj < padj.cutoff & log2FoldChange > lfc.cutoff)
head(sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye)
colnames(sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye)[1] <- "transcript_id"
#add annotation
sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye_annot <- left_join(sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye,Trinotate_lym_subset,by="transcript_id")
#extract all genes that are significantly upregulated in the Compound Eye (negative log2fold change) compared to the Bioluminescent Upper Lip,
sigOE_DOWNREGULATED_logfold_Bio_UpperLip_Vs_Eye <- sigOE_Bio_UpperLip_Vs_Eye %>% filter(padj < padj.cutoff & log2FoldChange < -lfc.cutoff)
colnames(sigOE_DOWNREGULATED_logfold_Bio_UpperLip_Vs_Eye)[1] <- "transcript_id"
#add annotation
sigOE_DOWNREGULATED_logfold_Bio_UpperLip_Vs_Eye_annot <- left_join(sigOE_DOWNREGULATED_logfold_Bio_UpperLip_Vs_Eye,Trinotate_lym_subset,by="transcript_id")
#save these two dataframes for downstream analysis in Section 9.4
#genes that are significantly upregulated in Bioluminescent Upper Lip (positive log2fold change)
head(sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye)
#genes that are significantly upregulated in the Compound Eye (negative log2fold change) compared to the Bioluminescent Upper Lip.
head(sigOE_DOWNREGULATED_logfold_Bio_UpperLip_Vs_Eye)
#now set gut as reference
dds_vtsujii$group <- relevel(dds_vtsujii$group, ref= "Gut")
#rerun DESeq after setting a new reference
dds_vtsujii <- DESeq(dds_vtsujii)
#Define contrasts, extract results table, and shrink the log2 fold changes
res_tableOE_unshrunken_Bio_Upper_Lip_Vs_Gut <- results(dds_vtsujii, contrast= c("group", "Upper_lip", "Gut") , alpha = 0.05)
res_tableOE_Bio_Upper_Lip_Vs_Gut <- lfcShrink(dds_vtsujii, contrast= c("group", "Upper_lip", "Gut"), res = res_tableOE_unshrunken_Bio_Upper_Lip_Vs_Gut )
mcols(res_tableOE_Bio_Upper_Lip_Vs_Gut, use.names=T)
#convert to tibble
res_tableOE_Bio_Upper_Lip_Vs_Gut_tb <- res_tableOE_Bio_Upper_Lip_Vs_Gut %>% data.frame() %>% rownames_to_column(var="gene") %>% as_tibble()
#determine all differentially expressed genes that meet the cutoff values (i.e., padj.cutoff of 0.05 and lfc.cutoff of 0.58, both upregulated and downregulated)
sigOE_Bio_Upper_Lip_Vs_Gut <- res_tableOE_Bio_Upper_Lip_Vs_Gut_tb %>% filter(padj < padj.cutoff & abs(log2FoldChange) > lfc.cutoff)
#extract all genes that are significantly upregulated in the Bioluminescent Upper Lip (positive log2 fold change)
sigOE_UPREGULATED_logfold_Bio_Upper_Lip_Vs_Gut <- sigOE_Bio_Upper_Lip_Vs_Gut %>% filter(padj < padj.cutoff & log2FoldChange > lfc.cutoff)
colnames(sigOE_UPREGULATED_logfold_Bio_Upper_Lip_Vs_Gut)[1] <- "transcript_id"
#add annotation
sigOE_UPREGULATED_logfold_Bio_Upper_Lip_Vs_Gut_annot <- left_join(sigOE_UPREGULATED_logfold_Bio_Upper_Lip_Vs_Gut,Trinotate_lym_subset,by="transcript_id")
#extract all genes that are significantly upregulated in the Gut (negative log2fold change) compared to the Bioluminescent Upper Lip.
sigOE_DOWNREGULATED_logfold_Bio_Upper_Lip_Vs_Gut <- sigOE_Bio_Upper_Lip_Vs_Gut %>% filter(padj < padj.cutoff & log2FoldChange < -lfc.cutoff)
colnames(sigOE_DOWNREGULATED_logfold_Bio_Upper_Lip_Vs_Gut)[1]<- "transcript_id"
#add annotation
sigOE_DOWNREGULATED_logfold_Bio_Upper_Lip_Vs_Gut_annot <- left_join(sigOE_DOWNREGULATED_logfold_Bio_Upper_Lip_Vs_Gut, Trinotate_lym_subset,by="transcript_id")
#save these two dataframes for downstream analysis in Section 9.3
#genes that are significantly upregulated in the bioluminescent upper lip (positive log2 fold change)
head(sigOE_UPREGULATED_logfold_Bio_Upper_Lip_Vs_Gut)
#genes that are significantly upregulated in the gut (negative log2fold change) compared to the Bioluminescent Upper Lip.
head(sigOE_DOWNREGULATED_logfold_Bio_Upper_Lip_Vs_Gut)
To identify tissue-specific differential expression (i.e., significantly upregulated genes that are uniquely expressed), each tissue was compared to the other two and tissue-specific genes were extracted from the intersection of the Venn diagram (in Section 9.4.4). Genes that were significantly upregulated uniquely in each tissue type were used for all downstream analyses in the publication.
#combine genes upregulated in the Bioluminescent Upper Lip from pairwise comparisons - Bio Upper Lip vs Gut and Bio Upper Lip vs Eye - into a single dataframe.
head(sigOE_UPREGULATED_logfold_Bio_Upper_Lip_Vs_Gut)
head(sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye)
colnames(sigOE_UPREGULATED_logfold_Bio_Upper_Lip_Vs_Gut)[1] <- "gene"
colnames(sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye)[1] <- "gene"
#merge
merged_upper_lips_df <- rbind(
sigOE_UPREGULATED_logfold_Bio_Upper_Lip_Vs_Gut,
sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye
)
head(merged_upper_lips_df)
#remove gene duplicates while retaining one duplicate due to pairwise comparisons. The same gene can be found in both pairwise comparisons - Bio Upper Lip vs Gut and Bio Upper Lip vs Eye.
merged_upper_lips_unique <- merged_upper_lips_df[!duplicated(merged_upper_lips_df$gene), ]
head(merged_upper_lips_unique)
nrow(merged_upper_lips_unique)
#write.csv(merged_upper_lips_unique, file = "Vtsujii_merged_bio_upper_lip_ALL_DE_genes.csv")
#combine genes upregulated in the compound eyes from the pairwise comparisons - Bio Upper Lip vs Eye and Gut vs Eye.
head(sigOE_DOWNREGULATED_logfold_Bio_UpperLip_Vs_Eye)
head(sigOE_DOWNREGULATED_logfold_Gut_vs_Eye)
colnames(sigOE_DOWNREGULATED_logfold_Bio_UpperLip_Vs_Eye)[1]<- "gene"
merged_Eye_df <- rbind(sigOE_DOWNREGULATED_logfold_Bio_UpperLip_Vs_Eye , sigOE_DOWNREGULATED_logfold_Gut_vs_Eye)
merged_Eye_unique <-merged_Eye_df[!duplicated(merged_Eye_df$gene), ]
head(merged_Eye_unique)
#write.csv(merged_Eye_unique, file = "Vtsujii_merged_Eye_ALL_DE_genes.csv")
#combine genes upregulated in the gut from pairwise comparisons - Gut vs Eye and Bio Upper Lip vs Gut.
head(sigOE_UPREGULATED_logfold_Gut_vs_Eye)
head(sigOE_DOWNREGULATED_logfold_Bio_Upper_Lip_Vs_Gut)
colnames(sigOE_UPREGULATED_logfold_Gut_vs_Eye)[1] <- "gene"
colnames(sigOE_DOWNREGULATED_logfold_Bio_Upper_Lip_Vs_Gut)[1]<- "gene"
merged_Gut_df <- rbind(sigOE_UPREGULATED_logfold_Gut_vs_Eye , sigOE_DOWNREGULATED_logfold_Bio_Upper_Lip_Vs_Gut)
merged_Gut_unique <-merged_Gut_df[!duplicated(merged_Gut_df$gene), ]
head(merged_Gut_unique)
#write.csv(merged_Gut_unique, file = "Vtsujii_merged_Gut_ALL_DE_genes.csv")
#generate a venn diagram to visualize shared significantly upregulated genes across tissue types and extract the genes that are unique to each tissue type.
unique_venn_list <- list(
Bio_Upper_Lip = merged_upper_lips_unique$gene ,
Gut = merged_Gut_unique$gene,
Compound_Eye = merged_Eye_unique$gene
)
ggvenn_unique <- ggvenn(
unique_venn_list,
fill_color = c('#81D4FA','#A5D6A7', '#CE93D8'),
stroke_size = .7, set_name_size = 6, text_size = 5
)
ggvenn_unique
# Open a PDF device
pdf("Luminous_DGE.pdf", width = 8, height = 6)
ggvenn_unique
dev.off()
#prep dataframes for extraction
Bio_Upper_Lip <- as.data.frame(merged_upper_lips_unique$gene)
colnames(Bio_Upper_Lip)[1]<- "gene"
Gut <- as.data.frame(merged_Gut_unique$gene)
colnames(Gut)[1]<- "gene"
Compound_eye <- as.data.frame(merged_Eye_unique$gene)
colnames(Compound_eye)[1]<- "gene"
# compare and extract unique genes for each tissue type
unique_genes_bio_upper_lip <- anti_join(Bio_Upper_Lip, Gut, by = "gene") %>%
anti_join(Compound_eye, by = "gene")
unique_genes_gut <- anti_join(Gut, Bio_Upper_Lip, by = "gene") %>%
anti_join(Compound_eye, by = "gene")
unique_genes_compound_eye <- anti_join(Compound_eye, Bio_Upper_Lip, by = "gene") %>%
anti_join(Gut, by = "gene")
nrow(unique_genes_bio_upper_lip)
nrow(unique_genes_gut)
nrow(unique_genes_compound_eye)
#add the annotations back to the unique genes in each tissue type by subsetting
unique_genes_bio_upper_lip_info <- merged_upper_lips_unique %>%
filter(gene %in% unique_genes_bio_upper_lip$gene)
unique_genes_eye_info <- merged_Eye_unique %>%
filter(gene %in% unique_genes_compound_eye$gene)
unique_genes_gut_info <- merged_Gut_unique %>%
filter(gene %in% unique_genes_gut$gene)
#add annotations back
colnames(unique_genes_bio_upper_lip_info)[1]<- "transcript_id"
unique_genes_bio_upper_lip_info_annot <- left_join(unique_genes_bio_upper_lip_info,Trinotate_lym_subset,by="transcript_id")
#add annotations back
colnames(unique_genes_eye_info)[1]<- "transcript_id"
unique_genes_eye_info_annot <- left_join(unique_genes_eye_info,Trinotate_lym_subset,by="transcript_id")
#add annotations backs
colnames(unique_genes_gut_info)[1] <- "transcript_id"
unique_genes_gut_info_annot <- left_join(unique_genes_gut_info,Trinotate_lym_subset,by="transcript_id")
#write.csv(unique_genes_bio_upper_lip_info_annot, file = "df_Vtsujii_sigfig_upreg_unique_BioUpperLip.csv")
#write.csv(unique_genes_eye_info_annot, file = "df_Vtsujii_sigfig_upreg_unique_comEye.csv")
#write.csv(unique_genes_gut_info_annot, file = "df_Vtsujii_sigfig_upreg_unique_Gut.csv")
The topGO package was used to perform GO enrichment analysis on significantly upregulated genes (i.e., uniquely expressed) in the bioluminescent upper lip. (Alexa Rahnenfuhrer, 2024). The figures representing the GO enrichments presented here were utilized for analysis, but were not included in the main text of the publication. The GO enrichments were visualized with GO-Figure! package for publication (Reijnders and Waterhouse, 2021). GO-Figure! reference https://github.com/lmesrop/BCN_publication/tree/main/Go-Figure!.
#import the trinotate go sheet from Trinotate output
geneID2GO_bio <- readMappings(file ="Trinotate_go_lym.txt")
#significantly upregulated genes (i.e. expressed uniquely) in the bioluminescent upper lip
head(unique_genes_bio_upper_lip_info)
#save the transcript ids of all the annotated genes under geneNames object
geneNames_bio<- as.character(Trinotate_lym_subset$transcript_id)
#save the transcript
myInterestingGenes_bio= as.character(unique_genes_bio_upper_lip_info$transcript_id)
#subset the genesNames by the transcript IDs in my red module
geneList_bio <- factor(as.integer(geneNames_bio %in% myInterestingGenes_bio))
names(geneList_bio) <- geneNames_bio
head(geneList_bio)
#run the topGO function.
GOdata_bio <- new("topGOdata", ontology = "BP", allGenes = geneList_bio,
annot = annFUN.gene2GO, gene2GO = geneID2GO_bio)
results_go_bio <- runTest(GOdata_bio, algorithm="weight01", statistic="Fisher")
#retrieve the GO enrichment
goEnrichment_bio <- GenTable(GOdata_bio, Fisher = results_go_bio, orderBy = "Fisher", topNodes =100, numChar =1000 )
goEnrichment_bio$Fisher <- as.numeric(goEnrichment_bio$Fisher)
goEnrichment_bio <- goEnrichment_bio[goEnrichment_bio$Fisher < 0.05,]
goEnrichment_bio <- goEnrichment_bio[goEnrichment_bio$Significant > 1,]
#write.table(goEnrichment_bio, "df_TopGO_Vargula_tsujii_DE_unique_BioUpperLip_BP.tsv",sep = "\t", quote=FALSE)
myterms_bio =goEnrichment_bio$GO.ID
mygenes_bio = genesInTerm(GOdata_bio, myterms_bio)
#extract the transcript ids for each GO term
for (i in 1:length(myterms_bio))
{
myterm_bio <- myterms_bio[i]
mygenesforterm_bio <- mygenes_bio[myterm_bio][[1]]
myfactor_bio <- mygenesforterm_bio %in% myInterestingGenes_bio
mygenesforterm2_bio <- mygenesforterm_bio[myfactor_bio == TRUE]
mygenesforterm2_bio <- paste(mygenesforterm2_bio, collapse=',')
print(paste("Term",myterm_bio,"genes:",mygenesforterm2_bio))
}
ntop = 48
ggdata <- goEnrichment_bio[1:ntop,]
ggdata$Term <- factor(ggdata$Term, levels = rev(ggdata$Term))
plot_Bio_UL_BP <- ggplot(ggdata,
aes(x = Term, y = -log10(Fisher), size = Significant, fill = -log10(Fisher))) +
ylim(1,11) +
geom_point(shape = 21) +
scale_size(range = c(2.5,12.5)) +
scale_fill_continuous(low = 'royalblue', high = 'red4') +
labs(
title = 'GO Analysis - Biological Process')+
theme_bw(base_size = 24) +
labs(size= "Number of Genes")+
theme(
legend.position = 'right',
legend.background = element_rect(),
plot.title = element_text(angle = 0, size = 16, face = 'bold', vjust = 1),
plot.subtitle = element_text(angle = 0, size = 10, face = 'bold', vjust = 1),
plot.caption = element_text(angle = 0, size = 12, face = 'bold', vjust = 1),
axis.text.x = element_blank(),
axis.text.y = element_text(angle = 0, size = 13, face = 'bold', vjust = 0.5),
#axis.title = element_text(size = 12, face = 'bold'),
axis.title.x = element_blank(),
#axis.title.y = element_text(size = 8, face = 'bold'),
axis.line = element_line(colour = 'black'),
#Legend
legend.key = element_blank(),
legend.text = element_text(size = 14, face = "bold"),
title = element_text(size = 14, face = "bold")) +
coord_flip()
#dev.off()
plot_Bio_UL_BP + labs(x = NULL)
options(repr.plot.width=10, repr.plot.height=8, repr.plot.res = 500)
#run the topGO function.
GOdata_bio_MF <- new("topGOdata", ontology = "MF", allGenes = geneList_bio,
annot = annFUN.gene2GO, gene2GO = geneID2GO_bio)
results_go_bio_MF <- runTest(GOdata_bio_MF, algorithm="weight01", statistic="Fisher")
#retrieve the GO enrichment
goEnrichment_bio_MF <- GenTable(GOdata_bio_MF, Fisher = results_go_bio_MF, orderBy = "Fisher", topNodes =100, numChar =1000 )
goEnrichment_bio_MF$Fisher <- as.numeric(goEnrichment_bio_MF$Fisher)
goEnrichment_bio_MF <- goEnrichment_bio_MF[goEnrichment_bio_MF$Fisher < 0.05,]
goEnrichment_bio_MF <- goEnrichment_bio_MF[goEnrichment_bio_MF$Significant > 1,]
goEnrichment_bio_MF <- goEnrichment_bio_MF[,c("GO.ID","Term", "Annotated","Significant", "Expected", "Fisher")]
goEnrichment_bio_MF
ntop = 29
ggdata <- goEnrichment_bio_MF[1:ntop,]
ggdata$Term <- factor(ggdata$Term, levels = rev(ggdata$Term))
plot_Bio_UL_MF <- ggplot(ggdata,
aes(x = Term, y = -log10(Fisher), size = Significant, fill = -log10(Fisher))) +
ylim(1,11) +
geom_point(shape = 21) +
scale_size(range = c(2.5,12.5)) +
scale_fill_continuous(low = 'royalblue', high = 'red4') +
labs(
title = 'GO Analysis - Molecular Function')+
theme_bw(base_size = 24) +
labs(size= "Number of Genes")+
theme(
legend.position = 'right',
legend.background = element_rect(),
plot.title = element_text(angle = 0, size = 16, face = 'bold', vjust = 1),
plot.subtitle = element_text(angle = 0, size = 10, face = 'bold', vjust = 1),
plot.caption = element_text(angle = 0, size = 12, face = 'bold', vjust = 1),
axis.text.x = element_blank(),
axis.text.y = element_text(angle = 0, size = 13, face = 'bold', vjust = 0.5),
#axis.title = element_text(size = 12, face = 'bold'),
axis.title.x = element_blank(),
#axis.title.y = element_text(size = 8, face = 'bold'),
axis.line = element_line(colour = 'black'),
#Legend
legend.key = element_blank(),
legend.text = element_text(size = 14, face = "bold"),
title = element_text(size = 14, face = "bold")) +
coord_flip()
#dev.off()
plot_Bio_UL_MF + labs(x=NULL)
options(repr.plot.width=16, repr.plot.height=8, repr.plot.res = 500)
Import Skogsbergia sp. gene expression matrix which consists of three tissue types - upper lip, compound eye and gut - with five biological replicates for each tissue. Generate the sample name sheet (meta sheet) for downstream DESeq2 analyses. Read in the annotation file for Skogsbergia sp. generated by Trinotate.
#read in gene expression matrix
skogs_counts <- read.delim("skogs_fasta90_isoform_combined.tab", header = TRUE, sep = "\t", quote = "")
#fix the column names
row.names(skogs_counts) <-skogs_counts$X
#remove the column X and extra X.1
skogs_counts$X.1 <- NULL
skogs_counts$X <- NULL
meta <- data.frame(row.names = colnames(skogs_counts))
head(meta)
sample_name = c("Upper_lip", "Eye", "Gut", "Upper_lip", "Eye", "Gut", "Upper_lip", "Eye", "Gut", "Upper_lip", "Eye", "Gut","Upper_lip", "Eye", "Gut")
meta$sample_name <- sample_name
meta$names <- rownames(meta)
rownames(meta) <- NULL
#import count table, meta and sample_name into a DESeq2 object
dds_count_table <- DESeqDataSetFromMatrix(countData = skogs_counts, colData = meta, design = ~sample_name)
nrow(counts(dds_count_table))
# run DESeq2 function and normalization
dds_raw_counts <- DESeq(dds_count_table, betaPrior = FALSE, parallel = TRUE)
# Perform a variance-stabilizing transformation
vsd_raw_counts <- varianceStabilizingTransformation(dds_raw_counts)
sampleDists_raw_counts <- dist(t(assay(vsd_raw_counts)))
#plot the heatmap
sampleDists_raw_counts_Matrix <- as.matrix(sampleDists_raw_counts)
rownames(sampleDists_raw_counts_Matrix) <- paste(colData(dds_raw_counts)$sample_name)
colnames(sampleDists_raw_counts_Matrix) <- colData(dds_raw_counts)$names
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDists_raw_counts_Matrix,
clustering_distance_rows=sampleDists_raw_counts,
clustering_distance_cols=sampleDists_raw_counts,
col=colors)
#plot the PCA
pcaData_raw_counts <- plotPCA(vsd_raw_counts, intgroup="sample_name", returnData=TRUE)
percentVar_raw_counts <- round(100 * attr(pcaData_raw_counts, "percentVar"))
ggplot(pcaData_raw_counts, aes(PC1, PC2, color=sample_name, shape=sample_name)) +
geom_point(size=5) +
xlab(paste0("PC1: ",percentVar_raw_counts[1],"% variance")) +
ylab(paste0("PC2: ",percentVar_raw_counts[2],"% variance")) +
geom_point(size=5) +
theme(panel.grid.major = element_line(colour = "gray97", size = 1), panel.grid.minor = element_line(linetype = "dotted"), panel.background = element_rect(fill = NA),
legend.key = element_rect(fill = "gray100")) + theme(axis.line = element_line(size = 0.5,linetype = "solid")) +
theme(panel.border = element_rect(colour = "black", fill=NA, size=1)) +
coord_fixed() +
scale_color_manual(values = c('#A5D6A7','#CE93D8', '#81D4FA'))+theme(
legend.title = element_text(size = 16),
legend.text = element_text(size = 14),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text = element_text(size = 12)
)
options(repr.plot.width=8, repr.plot.height=6, repr.plot.res = 150)
#read in the Trinotate sheet for Skogsbergia sp.
Trinotate_lym_subset_skogs <- read.csv(file = "Trinotate_lym_subset_Skogsbergia_cdhit90_longestisoform.csv")
Trinotate_lym_subset_skogs$X <- NULL
colnames(Trinotate_lym_subset_skogs)[1]<- "gene_id"
Differential gene expression analysis was carried out in DESeq2 (Love et al., 2014). For Skogsbergia sp. , we determined differentially upregulated genes in three tissue types - upper lip, compound eye and gut - using five biological replicates for each tissue. DESeq2 was employed using a p-value < 0.05 and FC > 1.5 for the significance of differentially expressed genes using the Benjamini-Hochberg method to account for false discovery rate (FDR). Pairwise comparisons were done across tissue types (i.e., upper lip to compound eye, upper lip to gut, gut to compound eye). To determine tissue-specific differential upregulation, each tissue was compared to the other two (e.g., genes upregulated uniquely in the upper lip were determined by comparing upper lip expression to both compound eye and gut). For each pairwise comparison, the reference tissue was specified to determine the significantly upregulated genes in each tissue type (i.e., positive vs negative log2fold change). For example,in the upper lip vs eye comparison, if a gene has a positive log2fold change, then that gene is upregulated in the upper lip compared to the eye. Conversely, if a gene has a negative log2fold change then the gene is upregulated in eye compared to the upper lip. Genes that were upregulated uniquely in each tissue type were used for all downstream analyses.
dds_count_table
#run DESeq2 analysis
dds_DE <- DESeq(dds_count_table)
#set Eyes as a reference tissue
dds_DE$sample_name <- relevel(dds_DE$sample_name, ref= "Eye")
#rerun DESeq command after reference is specified
dds_DE <- DESeq(dds_DE)
#define contrasts, extract results table, and shrink the log2 fold changes
res_tableOE_unshrunken_UpperlipVs_Eye <- results(dds_DE, contrast= c("sample_name", "Upper_lip", "Eye") , alpha = 0.05)
res_tableOE_UpperlipVs_Eye <- lfcShrink(dds_DE, contrast= c("sample_name", "Upper_lip", "Eye"), res = res_tableOE_unshrunken_UpperlipVs_Eye)
mcols(res_tableOE_UpperlipVs_Eye, use.names=T)
#set thresholds
#lfc.cutoff value of 0.58 translates to a 1.5-fold change in expression
#padj.cutoff value of 0.05
padj.cutoff <- 0.05
lfc.cutoff <- 0.58
#convert to tibble
res_tableOE_UpperlipVs_Eye_tb <- res_tableOE_UpperlipVs_Eye %>% data.frame() %>% rownames_to_column(var="gene") %>% as_tibble()
#to determine all differentially expressed genes that meet the cutoff values (i.e., padj.cutoff of 0.05 and lfc.cutoff of 0.58, both upregulated and downregulated)
sigOE_UpperlipVs_Eye <- res_tableOE_UpperlipVs_Eye_tb %>% filter(padj < padj.cutoff & abs(log2FoldChange) > lfc.cutoff)
colnames(sigOE_UpperlipVs_Eye)[1]<- "transcript_id"
#extract all genes that are significantly upregulated in the Upper Lip (positive log2 fold change)
Upperlip_Vs_eye_sigOE_UPREGULATED_logfold <- sigOE_UpperlipVs_Eye %>% filter(padj < padj.cutoff & log2FoldChange > lfc.cutoff)
#add the annotation
Upperlip_Vs_eye_sigOE_UPREGULATED_logfold_annot <- setDT(Trinotate_lym_subset_skogs, key = 'transcript_id')[J(Upperlip_Vs_eye_sigOE_UPREGULATED_logfold)]
#extract all genes that are significantly upregulated in the Compound Eye (negative log2fold change) compared to the Upper Lip.
Upperlip_Vs_eye_sigOE_DOWNREGULATED_logfold <- sigOE_UpperlipVs_Eye %>% filter(padj < padj.cutoff & log2FoldChange < -lfc.cutoff)
#save these two dataframes for downstream analysis in Section 4.4
#genes that are significantly upregulated in the Upper Lip (positive log2 fold change)
head(Upperlip_Vs_eye_sigOE_UPREGULATED_logfold)
#genes that significantly upregulated in the Compound Eye (negative log2fold change) compared to the Upper Lip.
head(Upperlip_Vs_eye_sigOE_DOWNREGULATED_logfold)
#define contrasts, extract results table, and shrink the log2 fold changes
res_tableOE_unshrunken_GutVsEye <- results(dds_DE, contrast= c("sample_name", "Gut", "Eye") , alpha = 0.05)
res_tableOE_GutVsEye <- lfcShrink(dds_DE, contrast= c("sample_name", "Gut", "Eye"), res = res_tableOE_unshrunken_GutVsEye)
mcols(res_tableOE_GutVsEye, use.names=T)
#convert to tibble
res_tableOE_tb_GutVsEye <- res_tableOE_GutVsEye %>% data.frame() %>% rownames_to_column(var="gene") %>% as_tibble()
#determine all differentially expressed genes that meet the cutoff values (i.e., padj.cutoff of 0.05 and lfc.cutoff of 0.58, both upregulated and downregulated)
sigOE_GutVsEye <- res_tableOE_tb_GutVsEye %>% filter(padj < padj.cutoff & abs(log2FoldChange) > lfc.cutoff)
colnames(sigOE_GutVsEye)[1]<- "transcript_id"
#extract all genes that are significantly upregulated in the Gut (positive log2fold change)
GutVsEye_sigOE_UPREGULATED_logfold <- sigOE_GutVsEye %>% filter(padj < padj.cutoff & log2FoldChange > lfc.cutoff)
#add the annotation
GutVsEye_sigOE_UPREGULATED_logfold_annot <- setDT(Trinotate_lym_subset_skogs, key = 'transcript_id')[J(GutVsEye_sigOE_UPREGULATED_logfold)]
#extract all genes that are significantly upregulated in the Compound Eye (negative log2fold change) compared to the Gut.
GutVsEye_sigOE_DOWNREGULATED_logfold <- sigOE_GutVsEye %>% filter(padj < padj.cutoff & log2FoldChange < -lfc.cutoff)
#save these two dataframes for downstream analysis
#genes that are significantly upregulated in the Gut (positive log2 fold change)
head(GutVsEye_sigOE_UPREGULATED_logfold)
#genes that are significantly upregulated in the Compound Eye (negative log2fold change) but that are downregulated in the gut
head(GutVsEye_sigOE_DOWNREGULATED_logfold)
#now set gut as a reference tissue
dds_DE$sample_name <- relevel(dds_DE$sample_name, ref= "Gut")
#rerun DESeq
dds_DE <- DESeq(dds_DE)
#define contrasts, extract results table, and shrink the log2 fold changes
res_tableOE_unshrunken_UpperLipVsGut <- results(dds_DE, contrast= c("sample_name", "Upper_lip", "Gut") , alpha = 0.05)
res_tableOE_UpperLipVsGut <- lfcShrink(dds_DE, contrast= c("sample_name", "Upper_lip", "Gut"), res = res_tableOE_unshrunken_UpperLipVsGut)
mcols(res_tableOE_UpperLipVsGut, use.names=T)
# convert to tibble
res_tableOE_tb_UpperLipVsGut <- res_tableOE_UpperLipVsGut %>% data.frame() %>% rownames_to_column(var="gene") %>% as_tibble()
#to determine all differentially expressed genes that meet the cutoff values (i.e., padj.cutoff of 0.05 and lfc.cutoff of 0.58, both upregulated and downregulated)
sigOE_UpperLipVsGut <- res_tableOE_tb_UpperLipVsGut %>% filter(padj < padj.cutoff & abs(log2FoldChange) > lfc.cutoff)
#extract all genes that are significantly upregulated in the upper lip (positive log2 fold change)
UpperLipVsGut_sigOE_UPREGULATED_logfold <- sigOE_UpperLipVsGut %>% filter(padj < padj.cutoff & log2FoldChange > lfc.cutoff)
#genes that are significantly upregulated in the upper lip (positive log2 fold change)
head(UpperLipVsGut_sigOE_UPREGULATED_logfold )
#extract all genes that are significantly upregulated in the Gut (negative log2fold change) compared to the upper lip
UpperLipVsGut_sigOE_DOWNREGULATED_logfold <- sigOE_UpperLipVsGut %>%
filter(padj < padj.cutoff & log2FoldChange < -lfc.cutoff)
#extract all genes that are significantly upregulated in the Gut (negative log2fold change) compared to the Upper Lip.
head(UpperLipVsGut_sigOE_DOWNREGULATED_logfold)
#save these two dataframes for downstream analysis
#genes that are significantly upregulated in the Upper Lip (positive log2fold change)
head(UpperLipVsGut_sigOE_UPREGULATED_logfold)
#genes that are significantly upregulated in the Gut (negative log2fold change) compared to the Upper Lip
head(UpperLipVsGut_sigOE_DOWNREGULATED_logfold)
To identify tissue-specific differential expression (i.e., significantly upregulated genes that are uniquely expressed), each tissue was compared to the other two and tissue-specific genes were extracted from the intersection of the Venn diagram.Genes that were upregulated uniquely in each tissue type were used for all downstream analyses in the publication.
#combine genes upregulated in upper lips (Upper Lip vs Gut and Upper Lip vs Eye) from both pairwise comparisons into a single dataframe
head(UpperLipVsGut_sigOE_UPREGULATED_logfold)
head(Upperlip_Vs_eye_sigOE_UPREGULATED_logfold)
colnames(UpperLipVsGut_sigOE_UPREGULATED_logfold)[1]<- "gene"
colnames(Upperlip_Vs_eye_sigOE_UPREGULATED_logfold)[1]<- "gene"
#merge
merged_upper_lips_df <- rbind(UpperLipVsGut_sigOE_UPREGULATED_logfold, Upperlip_Vs_eye_sigOE_UPREGULATED_logfold)
#remove gene duplicates while retaining one duplicate due to the pairwise comparisons (Upper Lip vs Gut and Upper Lip vs Eye)
merged_upper_lips_unique <- merged_upper_lips_df[!duplicated(merged_upper_lips_df$gene), ]
head(merged_upper_lips_unique)
#write.csv(merged_upper_lips_unique, file = "SKOGS_merged_upper_lips_ALL_DE_genes.csv")
#combine genes upregulated in compound eye (Upper Lip vs Compound Eye and Gut vs Compound Eye) from both pairwise comparisons into a single dataframe
#genes that significantly upregulated in the Compound Eye
head(Upperlip_Vs_eye_sigOE_DOWNREGULATED_logfold)
#genes that are significantly upregulated in the Compound Eye
head(GutVsEye_sigOE_DOWNREGULATED_logfold)
colnames(Upperlip_Vs_eye_sigOE_DOWNREGULATED_logfold)[1]<- "gene"
colnames(GutVsEye_sigOE_DOWNREGULATED_logfold)[1]<- "gene"
#merged eye
merged_Eye_df <- rbind(Upperlip_Vs_eye_sigOE_DOWNREGULATED_logfold , GutVsEye_sigOE_DOWNREGULATED_logfold)
head(merged_Eye_df)
#remove gene duplicates while retaining one duplicate due to the pairwise comparisons (Upper Lip vs Compound Eye and Gut vs Compound Eye).
merged_Eye_unique <-merged_Eye_df[!duplicated(merged_Eye_df$gene), ]
head(merged_Eye_unique)
#write.csv(merged_Eye_unique, file = "SKOGS_merged_Eye_ALL_DE_genes.csv")
#combine genes upregulated in Gut (Gut vs Eye and Upper lip vs Gut) from both pairwise comparisons into a single dataframe
#genes that are significantly upregulated in the Gut
head(GutVsEye_sigOE_UPREGULATED_logfold)
#genes that are significantly upregulated in the Gut
head(UpperLipVsGut_sigOE_DOWNREGULATED_logfold)
colnames(GutVsEye_sigOE_UPREGULATED_logfold)[1]<- "gene"
colnames(UpperLipVsGut_sigOE_DOWNREGULATED_logfold)[1]<- "gene"
#merge
merged_Gut_df <- rbind(GutVsEye_sigOE_UPREGULATED_logfold , UpperLipVsGut_sigOE_DOWNREGULATED_logfold)
head(merged_Gut_df)
#remove gene duplicates while retaining one duplicate due to the pairwise comparisons (Gut vs Eye and Upper lip vs Gut)
merged_Gut_unique <-merged_Gut_df[!duplicated(merged_Gut_df$gene), ]
head(merged_Gut_unique)
#write.csv(merged_Gut_unique, file = "SKOGS_merged_Gut_ALL_DE_genes.csv")
#generate a venn diagram to visualize shared significantly upregulated genes across tissue types and extract the genes that are unique to each tissue type.
unique_venn_list <- list(
Upper_Lip = merged_upper_lips_unique$gene ,
Gut = merged_Gut_unique$gene,
Compound_Eye = merged_Eye_unique$gene
)
ggvenn_unique <- ggvenn(
unique_venn_list,
fill_color = c('#81D4FA','#A5D6A7', '#CE93D8'),
stroke_size = .7, set_name_size = 6, text_size = 5
)
ggvenn_unique
# Open a PDF device
pdf("Non_Luminous_DGE.pdf", width = 8, height = 6)
ggvenn_unique
dev.off()
#prep dataframes for extraction
Upper_Lip <- as.data.frame(merged_upper_lips_unique$gene)
colnames(Upper_Lip)[1]<- "gene"
Gut <- as.data.frame(merged_Gut_unique$gene)
colnames(Gut)[1]<- "gene"
Compound_eye <- as.data.frame(merged_Eye_unique$gene)
colnames(Compound_eye)[1]<- "gene"
# compare and extract unique genes for each tissue type
unique_genes_upper_lip <- anti_join(Upper_Lip, Gut, by = "gene") %>%
anti_join(Compound_eye, by = "gene")
unique_genes_gut <- anti_join(Gut, Upper_Lip, by = "gene") %>%
anti_join(Compound_eye, by = "gene")
unique_genes_compound_eye <- anti_join(Compound_eye, Upper_Lip, by = "gene") %>%
anti_join(Gut, by = "gene")
nrow(unique_genes_upper_lip)
nrow(unique_genes_gut)
nrow(unique_genes_compound_eye)
#add the annotations back to the unique genes in each tissue type by subsetting
unique_genes_upper_lip_info <- merged_upper_lips_unique %>%
filter(gene %in% unique_genes_upper_lip$gene)
unique_genes_eye_info <- merged_Eye_unique %>%
filter(gene %in% unique_genes_compound_eye$gene)
unique_genes_gut_info <- merged_Gut_unique %>%
filter(gene %in% unique_genes_gut$gene)
#add annotations back for upper lip
colnames(unique_genes_upper_lip_info)[1]<- "transcript_id"
unique_genes_upper_lip_info_annot <- left_join(unique_genes_upper_lip_info,Trinotate_lym_subset_skogs,by="transcript_id")
#write.csv(unique_genes_upper_lip_info_annot, file = "df_Skogs_sigfig_upreg_unique_Upper_Lip.csv")
#add annotations back for eyes
colnames(unique_genes_eye_info)[1]<- "transcript_id"
unique_genes_eye_info_annot <- left_join(unique_genes_eye_info,Trinotate_lym_subset_skogs,by="transcript_id")
#write.csv(unique_genes_eye_info_annot, file = "df_Skogs_sigfig_upreg_unique_comEye.csv")
#add annotations back for gut
colnames(unique_genes_gut_info)[1] <- "transcript_id"
unique_genes_gut_info_annot <- left_join(unique_genes_gut_info,Trinotate_lym_subset_skogs,by="transcript_id")
#write.csv(unique_genes_gut_info_annot, file = "df_Skogs_sigfig_upreg_unique_Gut.csv")
#import the trinotate go sheet from Trinotate output
geneID2GO <- readMappings(file ="go_annotations_cdhit90_longestisoform.txt")
#save the transcript ids of all the annotated genes under geneNames object
geneNames<- as.character(Trinotate_lym_subset_skogs$transcript_id)
#significantly upregulated genes (i.e. expressed uniquely) in the upper lip
head(unique_genes_upper_lip_info)
#save the transcript
myInterestingGenes= as.character(unique_genes_upper_lip_info$transcript_id)
#subset the genesNames by the transcript IDs in my red module
geneList <- factor(as.integer(geneNames %in% myInterestingGenes))
names(geneList) <- geneNames
head(geneList)
#run the topGO function for BP
GOdata <- new("topGOdata", ontology = "BP", allGenes = geneList,
annot = annFUN.gene2GO, gene2GO = geneID2GO)
results_go <- runTest(GOdata, algorithm="weight01", statistic="Fisher")
#retrieve the GO enrichment
goEnrichment <- GenTable(GOdata, Fisher = results_go, orderBy = "Fisher", topNodes = 10000, numChar=1000)
goEnrichment$Fisher <- as.numeric(goEnrichment$Fisher)
goEnrichment <- goEnrichment[goEnrichment$Fisher < 0.05,]
goEnrichment <- goEnrichment[goEnrichment$Significant > 1,]
myterms =goEnrichment$GO.ID
mygenes = genesInTerm(GOdata, myterms)
#extract the transcript ids for each GO term
var=c()
for (i in 1:length(myterms))
{
myterm <- myterms[i]
mygenesforterm <- mygenes[myterm][[1]]
myfactor <- mygenesforterm %in% myInterestingGenes
mygenesforterm2 <- mygenesforterm[myfactor == TRUE]
mygenesforterm2 <- paste(mygenesforterm2, collapse=',')
var[i]=paste(myterm,"genes:",mygenesforterm2)
}
# GO enrichment Upper Lip - BP
ntop = 50
ggdata <- goEnrichment[1:ntop,]
ggdata$Term <- factor(ggdata$Term, levels = rev(ggdata$Term))
plot_GO_UL_BP <- ggplot(ggdata,
aes(x = Term, y = -log10(Fisher), size = Significant, fill = -log10(Fisher))) +
expand_limits(y = 1) +
geom_point(shape = 21) +
scale_size(range = c(2,7)) +
scale_fill_continuous(low = 'royalblue', high = 'red4') +
labs(
title = 'GO Analysis - Biological Process')+
theme_bw(base_size = 24) +
labs(size= "Number of Genes")+
theme(
legend.position = 'right',
legend.background = element_rect(),
plot.title = element_text(angle = 0, size = 16, face = 'bold', vjust = 1),
plot.subtitle = element_text(angle = 0, size = 10, face = 'bold', vjust = 1),
plot.caption = element_text(angle = 0, size = 12, face = 'bold', vjust = 1),
axis.text.x = element_blank(),
axis.text.y = element_text(angle = 0, size = 13, face = 'bold', vjust = 0.5),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 8, face = 'bold'),
axis.line = element_line(colour = 'black'),
legend.key = element_blank(),
legend.text = element_text(size = 14, face = "bold"),
title = element_text(size = 14, face = "bold")) +
coord_flip()
#dev.off()
plot_GO_UL_BP + labs(x = NULL)
#library (repr)
options(repr.plot.width=12, repr.plot.height=8, repr.plot.res = 500)
#run the topGO function
GOdata_MF <- new("topGOdata", ontology = "MF", allGenes = geneList,
annot = annFUN.gene2GO, gene2GO = geneID2GO)
results_go_MF <- runTest(GOdata_MF, algorithm="weight01", statistic="Fisher")
#retrieve the GO enrichment
goEnrichment_MF <- GenTable(GOdata_MF, Fisher = results_go_MF, orderBy = "Fisher", topNodes = 100, numChar=1000)
#lets graph the GO enrichment
goEnrichment_MF$Fisher <- as.numeric(goEnrichment_MF$Fisher)
goEnrichment_MF <- goEnrichment_MF[goEnrichment_MF$Fisher < 0.05,]
goEnrichment_MF <- goEnrichment_MF[goEnrichment_MF$Significant > 1,]
# GO enrichment Upper Lip - MF
ntop = 22
ggdata <- goEnrichment_MF[1:ntop,]
ggdata$Term <- factor(ggdata$Term, levels = rev(ggdata$Term))
plot_GO_UL_MF <- ggplot(ggdata,
aes(x = Term, y = -log10(Fisher), size = Significant, fill = -log10(Fisher))) +
expand_limits(y = 1) +
geom_point(shape = 21) +
scale_size(range = c(2,7)) +
scale_fill_continuous(low = 'royalblue', high = 'red4') +
labs(
title = 'GO Analysis - Molecular Function')+
theme_bw(base_size = 24) +
labs(size= "Number of Genes")+
theme(
legend.position = 'right',
legend.background = element_rect(),
plot.title = element_text(angle = 0, size = 16, face = 'bold', vjust = 1),
plot.subtitle = element_text(angle = 0, size = 10, face = 'bold', vjust = 1),
plot.caption = element_text(angle = 0, size = 12, face = 'bold', vjust = 1),
axis.text.x = element_blank(),
axis.text.y = element_text(angle = 0, size = 13, face = 'bold', vjust = 0.5),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 8, face = 'bold'),
axis.line = element_line(colour = 'black'),
legend.key = element_blank(),
legend.text = element_text(size = 14, face = "bold"),
title = element_text(size = 14, face = "bold")) +
coord_flip()
#dev.off()
plot_GO_UL_MF +labs(x=NULL)
#library (repr)
options(repr.plot.width=8, repr.plot.height=5)
P. Langfelder, S. Horvath, WGCNA: an R package for weighted correlation network analysis. BMC Bioinformatics 9, 559 (2008)
M. I. Love, W. Huber, S. Anders, Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biol. 15, 550 (2014)
Alexa A, Rahnenfuhrer J, topGO: Enrichment Analysis for Gene Ontology (2024)
Shannon, Paul et al., Cytoscape: a software environment for integrated models of biomolecular interaction networks. Genome research vol. 13,11 (2003)